! (C) Copyright 1995- ECMWF.
! (C) Copyright 1995- Meteo-France.
! (C) Copyright 2022- NVIDIA.
!
! This software is licensed under the terms of the Apache Licence Version 2.0
! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
! In applying this licence, ECMWF does not waive the privileges and immunities
! granted to it by virtue of its status as an intergovernmental organisation
! nor does it submit to any jurisdiction.
!

MODULE TRLTOM_MOD
  USE BUFFERED_ALLOCATOR_MOD, ONLY: ALLOCATION_RESERVATION_HANDLE
  IMPLICIT NONE

  PRIVATE
  PUBLIC :: TRLTOM, PREPARE_TRLTOM, TRLTOM_HANDLE

  TYPE TRLTOM_HANDLE
    TYPE(ALLOCATION_RESERVATION_HANDLE) :: HPFBUF
  END TYPE
CONTAINS
  FUNCTION PREPARE_TRLTOM(ALLOCATOR, KF_FS) RESULT(HTRLTOM)
    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRBT
    USE TPM_DISTR,              ONLY: D
    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE
    USE ISO_C_BINDING,          ONLY: C_SIZE_T

    IMPLICIT NONE

    TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
    INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS
    TYPE(TRLTOM_HANDLE) :: HTRLTOM

    REAL(KIND=JPRBT) :: DUMMY

    HTRLTOM%HPFBUF = RESERVE(ALLOCATOR, int(D%NLENGT1B*2*KF_FS*SIZEOF(DUMMY),kind=c_size_t))
  END FUNCTION

  SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS)
    !**** *TRLTOM * - transposition in Fourierspace

    !     Purpose.
    !     --------
    !              Transpose Fourier coefficients from partitioning
    !              over latitudes to partitioning over wave numbers
    !              This is done between inverse Legendre Transform
    !              and inverse FFT.
    !              This is the inverse routine of TRMTOL.

    !**   Interface.
    !     ----------
    !        *CALL* *TRLTOM(...)*

    !        Explicit arguments : PFBUF  - Fourier coefficient buffer. It is
    !        --------------------          used for both input and output.

    !                             KF_FS - Number of fields communicated

    !        Implicit arguments :
    !        --------------------

    !     Method.
    !     -------
    !        See documentation

    !     Externals.
    !     ----------

    !     Reference.
    !     ----------
    !        ECMWF Research Department documentation of the IFS

    !     Author.
    !     -------
    !        MPP Group *ECMWF*

    !     Modifications.
    !     --------------
    !        Original : 95-10-01
    !        Modified : 97-06-18 G. Mozdzynski - control MPI mailbox use
    !                                            (NCOMBFLEN) for nphase.eq.1
    !        Modified : 99-05-28  D.Salmond - Optimise copies.
    !        Modified : 00-02-02  M.Hamrud  - Remove NPHASE
    !        D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message
    !                             passing and buffer packing
    !        G.Mozdzynski: 08-01-01 Cleanup
    !        Y.Seity   : 07-08-30 Add barrier synchronisation under LSYNC_TRANS
    !     ------------------------------------------------------------------

    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRBT
    USE YOMHOOK,                ONLY: LHOOK, DR_HOOK, JPHOOK
    USE MPL_MODULE,             ONLY: MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK
    USE TPM_DISTR,              ONLY: D, NPRTRW, NPROC, MYPROC, MYSETW
    USE TPM_GEN,                ONLY: LSYNC_TRANS, NERR
#if ECTRANS_HAVE_MPI
    USE MPI_F08,                ONLY: MPI_COMM, MPI_FLOAT, MPI_DOUBLE
    ! Missing: MPI_ALLTOALLV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157)
#endif
    USE TPM_STATS,              ONLY: GSTATS => GSTATS_NVTX
    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION
    USE ISO_C_BINDING,          ONLY: C_SIZE_T
    USE ABORT_TRANS_MOD,        ONLY: ABORT_TRANS

    IMPLICIT NONE

    INTEGER(KIND=JPIM) ,INTENT(IN)  :: KF_FS
    REAL(KIND=JPRBT)   ,INTENT(OUT), POINTER  :: PFBUF(:)
    REAL(KIND=JPRBT)   ,INTENT(INOUT), POINTER :: PFBUF_IN(:)

    INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW)
    INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK
    REAL(KIND=JPHOOK) :: ZHOOK_HANDLE
    INTEGER(KIND=JPIM) :: IERROR

    TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR
    TYPE(TRLTOM_HANDLE), INTENT(IN) :: HTRLTOM
#if ECTRANS_HAVE_MPI
    TYPE(MPI_COMM) :: LOCAL_COMM
#endif

#ifdef PARKINDTRANS_SINGLE
#define TRLTOM_DTYPE MPI_FLOAT
#else
#define TRLTOM_DTYPE MPI_DOUBLE
#endif

#if ECTRANS_HAVE_MPI
    LOCAL_COMM%MPI_VAL = MPL_ALL_MS_COMM
#endif

    IF (LHOOK) CALL DR_HOOK('TRLTOM',0,ZHOOK_HANDLE)

    CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION(ALLOCATOR, HTRLTOM%HPFBUF),&
        & 1_C_SIZE_T, int(D%NLENGT1B*2*KF_FS*SIZEOF(PFBUF(1)),kind=c_size_t))

#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC DATA PRESENT(PFBUF,PFBUF_IN)
#endif

    IF(NPROC > 1) THEN
      DO J=1,NPRTRW
        ILENS(J) = D%NLTSGTB(J)*2*KF_FS
        IOFFS(J) = D%NSTAGT0B(J)*2*KF_FS
        ILENR(J) = D%NLTSFTB(J)*2*KF_FS
        IOFFR(J) = D%NSTAGT1B(J)*2*KF_FS
      ENDDO

      CALL GSTATS(806,0)

      ! copy to self workaround
      IRANK = MPL_MYRANK(MPL_ALL_MS_COMM)
      IF (ILENS(IRANK) .ne. ILENR(IRANK)) THEN
          WRITE(NERR,*) "ERROR", ILENS(IRANK), ILENR(IRANK)
          CALL ABORT_TRANS("TRLTOM: Error - ILENS(IRANK) /= ILENR(IRANK)")
      ENDIF
      IF (ILENS(IRANK) > 0) THEN
          FROM_SEND = IOFFS(IRANK) + 1
          TO_SEND = FROM_SEND + ILENS(IRANK) - 1
          FROM_RECV = IOFFR(IRANK) + 1
          TO_RECV = FROM_RECV + ILENR(IRANK) - 1
#ifdef OMPGPU
#endif
#ifdef ACCGPU
          !$ACC KERNELS ASYNC(1)
#endif
          PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND)
#ifdef OMPGPU
#endif
#ifdef ACCGPU
          !$ACC END KERNELS
#endif
          ILENS(IRANK) = 0
          ILENR(IRANK) = 0
      ENDIF

      IF (LSYNC_TRANS) THEN
        CALL GSTATS(430,0)
        CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='')
        CALL GSTATS(430,1)
      ENDIF
      CALL GSTATS(411,0)
#ifdef USE_GPU_AWARE_MPI
#ifdef OMPGPU
#endif
#ifdef ACCGPU
      !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF)
#endif
#else
    !! this is safe-but-slow fallback for running without GPU-aware MPI
    !$ACC UPDATE HOST(PFBUF_IN,PFBUF)
#endif
#if ECTRANS_HAVE_MPI
      CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,IOFFS,TRLTOM_DTYPE,&
       & PFBUF,ILENR,IOFFR, TRLTOM_DTYPE, &
       & LOCAL_COMM,IERROR)
#else
      CALL ABORT_TRANS("Should not be here: MPI is disabled")
#endif
#ifdef USE_GPU_AWARE_MPI
#ifdef OMPGPU
#endif
#ifdef ACCGPU
      !$ACC END HOST_DATA
#endif
#else
    !! this is safe-but-slow fallback for running without GPU-aware MPI
    !$ACC UPDATE DEVICE(PFBUF)
#endif
      IF (LSYNC_TRANS) THEN
        CALL GSTATS(431,0)
        CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='')
        CALL GSTATS(431,1)
      ENDIF
      CALL GSTATS(411,1)

#ifdef ACCGPU
      !$ACC WAIT(1)
#endif
      CALL GSTATS(806,1)
    ELSE
      ILEN = D%NLTSGTB(MYSETW)*2*KF_FS
      ISTA = D%NSTAGT1B(MYSETW)*2*KF_FS+1
      CALL GSTATS(1607,0)
#ifdef OMPGPU
#endif
#ifdef ACCGPU
      !$ACC PARALLEL LOOP DEFAULT(NONE) FIRSTPRIVATE(ISTA,ILEN)
#endif
      DO J=ISTA,ISTA+ILEN-1
        PFBUF(J) = PFBUF_IN(J)
      ENDDO
      CALL GSTATS(1607,1)
    ENDIF

#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC END DATA
#endif

    IF (LHOOK) CALL DR_HOOK('TRLTOM',1,ZHOOK_HANDLE)
    !     ------------------------------------------------------------------
  END SUBROUTINE TRLTOM
END MODULE TRLTOM_MOD
